In [48]:
#pip install folium
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
df=pd.read_csv('Dataset .csv')
df
Out[5]:
| Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Currency | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 |
| 1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 |
| 2 | 6300002 | Heat - Edsa Shangri-La | 162 | Mandaluyong City | Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... | Edsa Shangri-La, Ortigas, Mandaluyong City | Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... | 121.056831 | 14.581404 | Seafood, Asian, Filipino, Indian | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.4 | Green | Very Good | 270 |
| 3 | 6318506 | Ooma | 162 | Mandaluyong City | Third Floor, Mega Fashion Hall, SM Megamall, O... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.056475 | 14.585318 | Japanese, Sushi | ... | Botswana Pula(P) | No | No | No | No | 4 | 4.9 | Dark Green | Excellent | 365 |
| 4 | 6314302 | Sambo Kojin | 162 | Mandaluyong City | Third Floor, Mega Atrium, SM Megamall, Ortigas... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.057508 | 14.584450 | Japanese, Korean | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.8 | Dark Green | Excellent | 229 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9546 | 5915730 | Naml۱ Gurme | 208 | ��stanbul | Kemanke�� Karamustafa Pa��a Mahallesi, R۱ht۱m ... | Karak�_y | Karak�_y, ��stanbul | 28.977392 | 41.022793 | Turkish | ... | Turkish Lira(TL) | No | No | No | No | 3 | 4.1 | Green | Very Good | 788 |
| 9547 | 5908749 | Ceviz A��ac۱ | 208 | ��stanbul | Ko��uyolu Mahallesi, Muhittin ��st�_nda�� Cadd... | Ko��uyolu | Ko��uyolu, ��stanbul | 29.041297 | 41.009847 | World Cuisine, Patisserie, Cafe | ... | Turkish Lira(TL) | No | No | No | No | 3 | 4.2 | Green | Very Good | 1034 |
| 9548 | 5915807 | Huqqa | 208 | ��stanbul | Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N... | Kuru�_e��me | Kuru�_e��me, ��stanbul | 29.034640 | 41.055817 | Italian, World Cuisine | ... | Turkish Lira(TL) | No | No | No | No | 4 | 3.7 | Yellow | Good | 661 |
| 9549 | 5916112 | A���k Kahve | 208 | ��stanbul | Kuru�_e��me Mahallesi, Muallim Naci Caddesi, N... | Kuru�_e��me | Kuru�_e��me, ��stanbul | 29.036019 | 41.057979 | Restaurant Cafe | ... | Turkish Lira(TL) | No | No | No | No | 4 | 4.0 | Green | Very Good | 901 |
| 9550 | 5927402 | Walter's Coffee Roastery | 208 | ��stanbul | Cafea��a Mahallesi, Bademalt۱ Sokak, No 21/B, ... | Moda | Moda, ��stanbul | 29.026016 | 40.984776 | Cafe | ... | Turkish Lira(TL) | No | No | No | No | 2 | 4.0 | Green | Very Good | 591 |
9551 rows × 21 columns
- 1.Explore the dataset and identify the numberof rows and columns
In [7]:
df.head()
Out[7]:
| Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Currency | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 |
| 1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 |
| 2 | 6300002 | Heat - Edsa Shangri-La | 162 | Mandaluyong City | Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... | Edsa Shangri-La, Ortigas, Mandaluyong City | Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... | 121.056831 | 14.581404 | Seafood, Asian, Filipino, Indian | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.4 | Green | Very Good | 270 |
| 3 | 6318506 | Ooma | 162 | Mandaluyong City | Third Floor, Mega Fashion Hall, SM Megamall, O... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.056475 | 14.585318 | Japanese, Sushi | ... | Botswana Pula(P) | No | No | No | No | 4 | 4.9 | Dark Green | Excellent | 365 |
| 4 | 6314302 | Sambo Kojin | 162 | Mandaluyong City | Third Floor, Mega Atrium, SM Megamall, Ortigas... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.057508 | 14.584450 | Japanese, Korean | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.8 | Dark Green | Excellent | 229 |
5 rows × 21 columns
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9551 entries, 0 to 9550 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Restaurant ID 9551 non-null int64 1 Restaurant Name 9551 non-null object 2 Country Code 9551 non-null int64 3 City 9551 non-null object 4 Address 9551 non-null object 5 Locality 9551 non-null object 6 Locality Verbose 9551 non-null object 7 Longitude 9551 non-null float64 8 Latitude 9551 non-null float64 9 Cuisines 9542 non-null object 10 Average Cost for two 9551 non-null int64 11 Currency 9551 non-null object 12 Has Table booking 9551 non-null object 13 Has Online delivery 9551 non-null object 14 Is delivering now 9551 non-null object 15 Switch to order menu 9551 non-null object 16 Price range 9551 non-null int64 17 Aggregate rating 9551 non-null float64 18 Rating color 9551 non-null object 19 Rating text 9551 non-null object 20 Votes 9551 non-null int64 dtypes: float64(3), int64(5), object(13) memory usage: 1.5+ MB
In [11]:
df.shape
Out[11]:
(9551, 21)
- 9551 Rows & 21 Columns
- 2 Check for missing values in each column and
handle them accordingly.
In [13]:
df.isnull().sum()
Out[13]:
Restaurant ID 0 Restaurant Name 0 Country Code 0 City 0 Address 0 Locality 0 Locality Verbose 0 Longitude 0 Latitude 0 Cuisines 9 Average Cost for two 0 Currency 0 Has Table booking 0 Has Online delivery 0 Is delivering now 0 Switch to order menu 0 Price range 0 Aggregate rating 0 Rating color 0 Rating text 0 Votes 0 dtype: int64
There are only 9 missing values in 'Cuisine' column which is very less
So, we can ignore or just replace these with 'Not Specified'
In [15]:
df['Cuisines'].fillna('Not Specified',inplace=True)
C:\Users\dhana\AppData\Local\Temp\ipykernel_2304\3188408958.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['Cuisines'].fillna('Not Specified',inplace=True)
In [17]:
df.isnull().sum()
Out[17]:
Restaurant ID 0 Restaurant Name 0 Country Code 0 City 0 Address 0 Locality 0 Locality Verbose 0 Longitude 0 Latitude 0 Cuisines 0 Average Cost for two 0 Currency 0 Has Table booking 0 Has Online delivery 0 Is delivering now 0 Switch to order menu 0 Price range 0 Aggregate rating 0 Rating color 0 Rating text 0 Votes 0 dtype: int64
- 3.Perform data type conversion if necessary. Analyze the distribution of the target variable ("Aggregate rating") and identify any class imbalances.
In [19]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9551 entries, 0 to 9550 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Restaurant ID 9551 non-null int64 1 Restaurant Name 9551 non-null object 2 Country Code 9551 non-null int64 3 City 9551 non-null object 4 Address 9551 non-null object 5 Locality 9551 non-null object 6 Locality Verbose 9551 non-null object 7 Longitude 9551 non-null float64 8 Latitude 9551 non-null float64 9 Cuisines 9551 non-null object 10 Average Cost for two 9551 non-null int64 11 Currency 9551 non-null object 12 Has Table booking 9551 non-null object 13 Has Online delivery 9551 non-null object 14 Is delivering now 9551 non-null object 15 Switch to order menu 9551 non-null object 16 Price range 9551 non-null int64 17 Aggregate rating 9551 non-null float64 18 Rating color 9551 non-null object 19 Rating text 9551 non-null object 20 Votes 9551 non-null int64 dtypes: float64(3), int64(5), object(13) memory usage: 1.5+ MB
- No need to do any data type conversion
In [21]:
# target variables 'Aggregate rating'
target='Aggregate rating'
# descriptive sttistics
print(df[target].describe())
count 9551.000000 mean 2.666370 std 1.516378 min 0.000000 25% 2.500000 50% 3.200000 75% 3.700000 max 4.900000 Name: Aggregate rating, dtype: float64
In [23]:
# Box plot
plt.figure(figsize=(8,5))
sns.boxplot(x=df[target])
plt.title('Box Plot')
plt.xlabel('Aggregative Rating')
plt.show()
In [25]:
# Histogram
plt.figure(figsize=(8,5))
sns.histplot(df[target],bins=30,kde=True,color='green')
plt.title('Histogram')
plt.xlabel('Aggregate Rating')
plt.ylabel('Frequency')
plt.show()
- No clas imbalance
In [27]:
df.describe()
Out[27]:
| Restaurant ID | Country Code | Longitude | Latitude | Average Cost for two | Price range | Aggregate rating | Votes | |
|---|---|---|---|---|---|---|---|---|
| count | 9.551000e+03 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 |
| mean | 9.051128e+06 | 18.365616 | 64.126574 | 25.854381 | 1199.210763 | 1.804837 | 2.666370 | 156.909748 |
| std | 8.791521e+06 | 56.750546 | 41.467058 | 11.007935 | 16121.183073 | 0.905609 | 1.516378 | 430.169145 |
| min | 5.300000e+01 | 1.000000 | -157.948486 | -41.330428 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 3.019625e+05 | 1.000000 | 77.081343 | 28.478713 | 250.000000 | 1.000000 | 2.500000 | 5.000000 |
| 50% | 6.004089e+06 | 1.000000 | 77.191964 | 28.570469 | 400.000000 | 2.000000 | 3.200000 | 31.000000 |
| 75% | 1.835229e+07 | 1.000000 | 77.282006 | 28.642758 | 700.000000 | 2.000000 | 3.700000 | 131.000000 |
| max | 1.850065e+07 | 216.000000 | 174.832089 | 55.976980 | 800000.000000 | 4.000000 | 4.900000 | 10934.000000 |
In [29]:
numerical_stats = df.describe()
print(numerical_stats)
Restaurant ID Country Code Longitude Latitude \
count 9.551000e+03 9551.000000 9551.000000 9551.000000
mean 9.051128e+06 18.365616 64.126574 25.854381
std 8.791521e+06 56.750546 41.467058 11.007935
min 5.300000e+01 1.000000 -157.948486 -41.330428
25% 3.019625e+05 1.000000 77.081343 28.478713
50% 6.004089e+06 1.000000 77.191964 28.570469
75% 1.835229e+07 1.000000 77.282006 28.642758
max 1.850065e+07 216.000000 174.832089 55.976980
Average Cost for two Price range Aggregate rating Votes
count 9551.000000 9551.000000 9551.000000 9551.000000
mean 1199.210763 1.804837 2.666370 156.909748
std 16121.183073 0.905609 1.516378 430.169145
min 0.000000 1.000000 0.000000 0.000000
25% 250.000000 1.000000 2.500000 5.000000
50% 400.000000 2.000000 3.200000 31.000000
75% 700.000000 2.000000 3.700000 131.000000
max 800000.000000 4.000000 4.900000 10934.000000
In [31]:
df[['Average Cost for two','Price range','Aggregate rating','Votes']].describe()
Out[31]:
| Average Cost for two | Price range | Aggregate rating | Votes | |
|---|---|---|---|---|
| count | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 |
| mean | 1199.210763 | 1.804837 | 2.666370 | 156.909748 |
| std | 16121.183073 | 0.905609 | 1.516378 | 430.169145 |
| min | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 250.000000 | 1.000000 | 2.500000 | 5.000000 |
| 50% | 400.000000 | 2.000000 | 3.200000 | 31.000000 |
| 75% | 700.000000 | 2.000000 | 3.700000 | 131.000000 |
| max | 800000.000000 | 4.000000 | 4.900000 | 10934.000000 |
- 2 Explore the distribution of categorical
variables like "Country Code,
" "City,
" and
"Cuisines. "
Identify the top cuisines and cities with the highest number of restaurants.
In [33]:
# Explore the distribution of categorical variables like "Country Code
plt.figure(figsize=(8,5))
sns.countplot(x='Country Code',data=df)
plt.title('Distribution of Resturants by Country Code')
plt.xlabel('Country Code')
plt.ylabel('No of Resturants')
plt.show()
- The majority of restaurants are located in Country Code 1.Followed by the second-highest concentration in Country Code 216
In [35]:
# Top Countries with the highest number of restaurants
top_countries=df['Country Code'].value_counts().head()
print('Top 5 Countries with the Highest number of restaurants')
print(top_countries)
Top 5 Countries with the Highest number of restaurants Country Code 1 8652 216 434 215 80 30 60 214 60 Name: count, dtype: int64
In [37]:
# Explore the distribution of 'City'
plt.figure(figsize=(8,5))
sns.countplot(x='City',data=df,order=df['City'].value_counts().head(20).index)
plt.title('Distribution of Resturants by City')
plt.xlabel('City')
plt.ylabel('No of Resturants')
plt.xticks(rotation=45)
plt.show()
In [39]:
# Explore the distribution of 'Cuisines'
plt.figure(figsize=(15,6))
cuisines_count=df['Cuisines'].value_counts()
cuisines_count.head(20).plot(kind='bar', color=sns.color_palette('Set2'))
plt.title('Top 20 Cuisines with the highest number of Restaurants')
plt.xlabel('Cuisines')
plt.ylabel('No of Resturants')
plt.xticks(rotation=45)
plt.show()
In [41]:
# Top Cuisines and Cities
# Top cities with the highest number of restaurants
top_cities=df['City'].value_counts().head(10)
print('Top 10 cities with the Highest Number of Restaurants : ')
print(top_cities)
Top 10 cities with the Highest Number of Restaurants : City New Delhi 5473 Gurgaon 1118 Noida 1080 Faridabad 251 Ghaziabad 25 Bhubaneshwar 21 Amritsar 21 Ahmedabad 21 Lucknow 21 Guwahati 21 Name: count, dtype: int64
In [43]:
# Top cuisines with the highest number of restaurants
top_cuisines=cuisines_count.head(10)
print('Top 10 cuisines with the highest number of restaurants:')
print(top_cuisines)
Top 10 cuisines with the highest number of restaurants: Cuisines North Indian 936 North Indian, Chinese 511 Chinese 354 Fast Food 354 North Indian, Mughlai 334 Cafe 299 Bakery 218 North Indian, Mughlai, Chinese 197 Bakery, Desserts 170 Street Food 149 Name: count, dtype: int64
In [45]:
city_counts = df['City'].value_counts()
print(city_counts)
City
New Delhi 5473
Gurgaon 1118
Noida 1080
Faridabad 251
Ghaziabad 25
...
Panchkula 1
Mc Millan 1
Mayfield 1
Macedon 1
Vineland Station 1
Name: count, Length: 141, dtype: int64
Level 1 - Task 3:-¶
Task : Geospatial Analysis
- 1.Visualize the locations of restaurants on a map using latitude and longitude information
In [13]:
#pip install shapely geopandas
In [19]:
#pip install geopandas fiona pyproj
In [47]:
# Locations of restaurants on a map using latitude and longitude information
# Import the necessary libraries
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
In [49]:
import folium
restaurant_map = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=6)
for index, row in df.iterrows():
popup_text = f"{row['Restaurant Name']} - Rating: {row['Votes']}"
folium.Marker([row['Latitude'], row['Longitude']], popup=popup_text).add_to(restaurant_map)
restaurant_map.save('restaurant_map.html')
plt.scatter(df['Longitude'], df['Latitude'], s=df['Votes'] * 20, alpha=0.7)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Restaurant Locations and Ratings')
plt.show()
city_distribution = df['City'].value_counts()
plt.bar(city_distribution.index, city_distribution.values)
plt.xlabel('City')
plt.ylabel('Number of Restaurants')
plt.title('Restaurant Distribution across Cities')
plt.xticks(rotation=45)
plt.show()
In [51]:
import seaborn as sns
plt.figure(figsize=(15, 6))
sns.barplot(x=city_counts.index,y=city_counts.values)
plt.xticks(rotation=90)
plt.xlabel("City")
plt.ylabel("Number of Restaurants")
plt.title("Distribution of Restaurants across Cities/Countries")
plt.show()
In [53]:
plt.figure(figsize=(8,5))
sns.countplot(y=df['City'],order=df.City.value_counts().head(10).index,palette='Set2')
plt.xlabel('Number of Restaurants')
plt.ylabel('name of Cities')
plt.title('Distribution of Restaurants Acress Cities')
plt.show()
C:\Users\dhana\AppData\Local\Temp\ipykernel_2304\850872848.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.countplot(y=df['City'],order=df.City.value_counts().head(10).index,palette='Set2')
In [55]:
map_location = [df['Latitude'].iloc[0], df['Longitude'].iloc[0]]
map_restaurent = folium.Map(location=map_location, zoom_start=12)
for index, row in df.iterrows():
folium.Marker(location=[row['Latitude'], row['Longitude']],
popup=row['Restaurant Name'],
icon=folium.Icon(icon='cutlery', prefix='fa')).add_to(map_restaurent)
map_restaurent
Out[55]:
Make this Notebook Trusted to load map: File -> Trust Notebook